package com.ipan.poi.word;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

import com.ipan.poi.utils.PoiFileHelper;

/**
 * Utility methods for reading Word documents ({@code .doc} / {@code .docx})
 * and converting them to HTML.
 * 
 * @author iPan
 * @version 2013-9-16
 */
public class WordUtils {
	
	private static final Logger logger = LoggerFactory.getLogger(WordUtils.class);

	/**
	 * Extracts the complete text of a Word document; common entry point for
	 * both {@code .doc} and {@code .docx} files.
	 * 
	 * @param file the Word document to read
	 * @return the extracted text, or {@code null} if extraction failed (the
	 *         failure is logged rather than thrown)
	 * @throws RuntimeException if the file extension is neither {@code doc}
	 *         nor {@code docx}
	 */
	public static String getText(File file) {
		String extension = PoiFileHelper.getFileExtension(file.getName());
		if ("doc".equals(extension)) {
			return readDocText(file);
		}
		if ("docx".equals(extension)) {
			return readDocxText(file);
		}
		throw new RuntimeException("文件\"" + file.getName() + "\"无法解析！");
	}
	
	/** Reads the text of a {@code .doc} file; logs and returns {@code null} on failure. */
	private static String readDocText(File file) {
		FileInputStream in = null;
		try {
			in = new FileInputStream(file);
			return new WordExtractor(new POIFSFileSystem(in)).getText();
		} catch (Exception e) {
			logger.error("创建doc抽取器失败！", e);
			return null;
		} finally {
			// Close the stream we opened instead of relying on the library to do it.
			closeQuietly(in);
		}
	}
	
	/** Reads the text of a {@code .docx} file; logs and returns {@code null} on failure. */
	private static String readDocxText(File file) {
		OPCPackage pkg = null;
		try {
			pkg = OPCPackage.open(file);
			return new XWPFWordExtractor(pkg).getText();
		} catch (Exception e) {
			logger.error("创建docx抽取器失败！", e);
			return null;
		} finally {
			revertQuietly(pkg);
		}
	}
	
	/**
	 * Returns the tables of a {@code .doc} document.
	 * 
	 * @param file the {@code .doc} file to read
	 * @return the tables found by iterating over the document's range; empty
	 *         if there are none
	 * @throws RuntimeException if the extension is not {@code doc} or the
	 *         document cannot be loaded (the original failure is the cause)
	 */
	public static List<Table> getHwpfTables(File file) {
		if (!PoiFileHelper.checkFileExtension(file.getName(), "doc")) {
			throw new RuntimeException("文件格式错误！");
		}
		HWPFDocument document = null;
		FileInputStream in = null;
		try {
			in = new FileInputStream(file);
			document = new HWPFDocument(new POIFSFileSystem(in));
		} catch (Exception e) {
			// Keep the cause so callers can see why loading failed.
			throw new RuntimeException("创建HWPFDocument失败！", e);
		} finally {
			closeQuietly(in);
		}
		Range range = document.getRange();
		TableIterator iter = new TableIterator(range);
		List<Table> tableList = new ArrayList<Table>();
		while (iter.hasNext()) {
			tableList.add(iter.next());
		}
		return tableList;
	}
	
	/**
	 * Returns the tables of a {@code .docx} document.
	 * 
	 * @param file the {@code .docx} file to read
	 * @return the list returned by {@code XWPFDocument.getTables()}
	 * @throws RuntimeException if the extension is not {@code docx} or the
	 *         document cannot be loaded (the original failure is the cause)
	 */
	public static List<XWPFTable> getXwpfTables(File file) {
		if (!PoiFileHelper.checkFileExtension(file.getName(), "docx")) {
			throw new RuntimeException("文件格式错误！");
		}
		XWPFDocument document = null;
		OPCPackage pkg = null;
		try {
			pkg = OPCPackage.open(file);
			document = new XWPFDocument(pkg);
		} catch (Exception e) {
			// Nothing is handed to the caller on failure, so release the package now.
			revertQuietly(pkg);
			throw new RuntimeException("创建XWPFDocument失败！", e);
		}
		// NOTE(review): on success the package is left open because the returned
		// tables belong to the live document model; confirm whether it can be
		// released here without breaking callers.
		return document.getTables();
	}
	
	/**
	 * Returns the trimmed text of every paragraph of a {@code .doc} document.
	 * Note (from the original author): all paragraphs are parsed, including
	 * those inside tables.
	 * 
	 * @param file the {@code .doc} file to read
	 * @return the trimmed paragraph texts; an empty list if extraction failed
	 *         (the failure is logged rather than thrown)
	 * @throws RuntimeException if the extension is not {@code doc}
	 */
	public static List<String> getHwpfParagraph(File file) {
		if (!PoiFileHelper.checkFileExtension(file.getName(), "doc")) {
			throw new RuntimeException("文件格式错误！");
		}
		String[] paragraphText = null;
		FileInputStream in = null;
		try {
			in = new FileInputStream(file);
			paragraphText = new WordExtractor(new POIFSFileSystem(in)).getParagraphText();
		} catch (Exception e) {
			logger.error("创建doc抽取器失败！", e);
		} finally {
			closeQuietly(in);
		}
		if (paragraphText == null) {
			// Extraction failed and was logged above; do not dereference the null array.
			return new ArrayList<String>();
		}
		List<String> result = new ArrayList<String>(paragraphText.length);
		for (String paragraph : paragraphText) {
			result.add(paragraph.trim());
		}
		return result;
	}
	
	/**
	 * Returns the text of every paragraph of a {@code .docx} document.
	 * Note (from the original author): paragraphs inside tables are not
	 * included.
	 * 
	 * @param file the {@code .docx} file to read
	 * @return the paragraph texts, in document order
	 * @throws RuntimeException if the extension is not {@code docx} or the
	 *         document cannot be loaded (the original failure is the cause)
	 */
	public static List<String> getXwpfParagraph(File file) {
		if (!PoiFileHelper.checkFileExtension(file.getName(), "docx")) {
			throw new RuntimeException("文件格式错误！");
		}
		OPCPackage pkg = null;
		try {
			XWPFDocument document = null;
			try {
				pkg = OPCPackage.open(file);
				document = new XWPFDocument(pkg);
			} catch (Exception e) {
				throw new RuntimeException("创建XWPFDocument失败！", e);
			}
			List<String> result = new ArrayList<String>();
			for (XWPFParagraph paragraph : document.getParagraphs()) {
				result.add(paragraph.getText());
			}
			return result;
		} finally {
			// Only plain strings leave this method, so the package can be released.
			revertQuietly(pkg);
		}
	}

	/**
	 * Removes all whitespace from the given text. Note (from the original
	 * author): needed when processing table text of Word 2003 documents.
	 * 
	 * @param text the text to clean, may be {@code null}
	 * @return the text without whitespace, or {@code null} if {@code text} is {@code null}
	 */
	public static String trimAll(String text) {
		// NOTE(review): besides \s the character class lists one literal space-like
		// character; confirm its code point (it may be a non-ASCII space) before
		// changing this pattern.
		return (text == null) ? null : text.trim().replaceAll("([\\s ])*", "");
	}
	
	/**
	 * Converts a Word document to an HTML document.
	 * <p>
	 * Example:
	 * <pre>
	 * OutputStream out = new FileOutputStream("d:/dfxy.html");
	 * try {
	 *     WordUtils.wordToHtml(new File("d:/dfxy.doc"), out, "d:/doc_img");
	 * } finally {
	 *     out.close();
	 * }
	 * </pre>
	 * 
	 * @param file the Word document ({@code doc} and {@code docx} are supported)
	 * @param out the stream the HTML is written to; it is not closed by this method
	 * @param imageDirPath directory the document's pictures are written to
	 * @throws RuntimeException if the file type is unsupported or the
	 *         conversion fails; the underlying failure is the cause
	 */
	public static void wordToHtml(File file, OutputStream out, String imageDirPath) {
		try {
			if (isDocFile(file)) {
				docToHtml(file, out, imageDirPath);
			} else if (isDocxFile(file)) {
				docxToHtml(file, out, imageDirPath);
			} else {
				throw new RuntimeException("文件\"" + file.getName() + "\"无法解析！");
			}
		} catch (Exception e) {
			throw new RuntimeException("word转HTML文档出错.", e);
		}
	}
	
	/**
	 * Converts a {@code .doc} file to HTML and writes it to {@code out} as
	 * indented UTF-8 HTML.
	 * 
	 * @param file the {@code .doc} file to convert
	 * @param out the stream the HTML is written to; it is not closed here
	 * @param imageDirPath directory pictures are saved to; if blank, pictures
	 *        are not saved and get an empty path
	 * @throws IOException if the document cannot be read
	 * @throws TransformerException if the HTML cannot be serialized
	 */
	protected static void docToHtml(File file, OutputStream out, final String imageDirPath) throws IOException, TransformerException {
		// Load the Word document.
		HWPFDocument wordDocument = null;
		FileInputStream in = new FileInputStream(file);
		try {
			wordDocument = new HWPFDocument(in);
		} finally {
			closeQuietly(in);
		}
		Document document = null;
		try {
			document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
		} catch (ParserConfigurationException e1) {
			throw new RuntimeException(e1);
		} catch (FactoryConfigurationError e1) {
			throw new RuntimeException(e1);
		}

		WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
		// Save embedded pictures to the image directory during conversion.
		wordToHtmlConverter.setPicturesManager(new PicturesManager() {
			public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
				return storePicture(imageDirPath, content, suggestedName);
			}
		});
		// Convert the document.
		wordToHtmlConverter.processDocument(wordDocument);
		// Serialize the resulting DOM as HTML.
		Document htmlDocument = wordToHtmlConverter.getDocument();
		DOMSource domSource = new DOMSource(htmlDocument);
		StreamResult streamResult = new StreamResult(out);
		Transformer serializer = TransformerFactory.newInstance().newTransformer();
		serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
		serializer.setOutputProperty(OutputKeys.INDENT, "yes");
		serializer.setOutputProperty(OutputKeys.METHOD, "html");
		serializer.transform(domSource, streamResult);
	}
	
	/**
	 * Writes one picture into the image directory and returns the path to
	 * reference it by: the saved path with everything up to and including the
	 * first {@code ':'} removed (e.g. a drive prefix), or an empty string when
	 * no image directory is configured.
	 */
	private static String storePicture(String imageDirPath, byte[] content, String suggestedName) {
		if (StringUtils.isBlank(imageDirPath)) {
			return "";
		}
		String dirPath = imageDirPath.replace("\\", "/");
		if (!dirPath.endsWith("/")) {
			dirPath += "/";
		}
		checkAndCreateDirectory(new File(dirPath));
		String picPath = dirPath + suggestedName;
		FileOutputStream picOut = null;
		try {
			picOut = new FileOutputStream(new File(picPath));
			picOut.write(content);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeQuietly(picOut);
		}
		int colon = picPath.indexOf(":");
		return (colon > 0) ? picPath.substring(colon + 1) : picPath;
	}
	
	/**
	 * Converts a {@code .docx} file to XHTML and writes it to {@code out}.
	 * 
	 * @param file the {@code .docx} file to convert
	 * @param out the stream the HTML is written to; it is not closed here
	 * @param imageDirPath directory pictures are extracted to; if blank, the
	 *        converter is invoked with {@code null} options
	 * @throws IOException if the document cannot be read or written
	 */
	protected static void docxToHtml(File file, OutputStream out, String imageDirPath) throws IOException {
		FileInputStream in = new FileInputStream(file);
		try {
			// Load the Word document.
			XWPFDocument document = new XWPFDocument(in);

			// Configure where pictures are extracted to and how they are referenced.
			XHTMLOptions options = null;
			if (StringUtils.isNotBlank(imageDirPath)) {
				File picDir = new File(imageDirPath);
				checkAndCreateDirectory(picDir);
				options = XHTMLOptions.create().URIResolver(new FileURIResolver(picDir));
				options.setExtractor(new FileImageExtractor(picDir));
			}
			
			// Convert and write the HTML.
			XHTMLConverter.getInstance().convert(document, out, options);
		} finally {
			closeQuietly(in);
		}
	}
	
	/**
	 * Tells whether the file name ends in the {@code doc} extension
	 * (case-insensitive).
	 */
	protected static boolean isDocFile(File file) {
		return hasExtension(file, "doc");
	}
	
	/**
	 * Tells whether the file name ends in the {@code docx} extension
	 * (case-insensitive).
	 */
	protected static boolean isDocxFile(File file) {
		return hasExtension(file, "docx");
	}
	
	/**
	 * Compares the part of the file name after the last {@code '.'} with
	 * {@code expected}, ignoring case. A name without any dot has no
	 * extension and never matches.
	 */
	private static boolean hasExtension(File file, String expected) {
		String fileName = file.getName();
		int index = fileName.lastIndexOf('.');
		if (index < 0) {
			return false;
		}
		return expected.equalsIgnoreCase(fileName.substring(index + 1));
	}
	
	/**
	 * Creates the directory and any missing parent directories if it does not
	 * exist yet. A failure to create it is logged, not thrown.
	 * 
	 * @param dir the directory to create; {@code null} is ignored
	 */
	protected static void checkAndCreateDirectory(File dir) {
		if (dir == null || dir.exists()) {
			return ;
		}
		// mkdirs() also creates missing parents and copes with relative paths
		// whose getParentFile() is null.
		if (!dir.mkdirs() && !dir.isDirectory()) {
			logger.warn("创建目录失败：{}", dir.getAbsolutePath());
		}
	}
	
	/** Closes the resource if it is non-null; a failure is logged, not thrown. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			logger.warn("关闭流失败！", e);
		}
	}
	
	/**
	 * Releases the package without saving anything back to the underlying
	 * file; a failure is logged, not thrown.
	 */
	private static void revertQuietly(OPCPackage pkg) {
		if (pkg == null) {
			return;
		}
		try {
			// revert() rather than close(): the file was only read and must not be rewritten.
			pkg.revert();
		} catch (RuntimeException e) {
			logger.warn("关闭OPCPackage失败！", e);
		}
	}
	
}
